# Load the dplyr package
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read an RDS file and assign it to a variable
df <- readRDS("exgr_test.rds")
# View the data
head(df)
To find the count of unique transcripts, we identify the unique transcript IDs within the dataset. Each transcript is uniquely represented by its transcript ID
unique_transcripts <- length(unique(df$transcript_id))
unique_transcripts
## [1] 234485
As you can see there are 234485 transcripts in the entire datatset
##Task 2: Counting Unique Exons ### To determine the number of unique exons, we follow these steps: 1. Group the data by transcript IDs. 2. Calculate the count of exons within each transcript. 3. Sum up the counts from all transcripts to obtain the total count of unique exons.
exon_counts_per_transcript <- df %>%
group_by(transcript_id) %>%
summarize(group_count = n())
# View the result
print(exon_counts_per_transcript)
## # A tibble: 234,485 × 2
## transcript_id group_count
## <int> <int>
## 1 1 3
## 2 2 6
## 3 3 3
## 4 4 2
## 5 5 1
## 6 6 1
## 7 7 3
## 8 8 1
## 9 9 3
## 10 10 1
## # … with 234,475 more rows
# Calculate the sum of group counts
total_unique_exons <- exon_counts_per_transcript %>%
summarize(sum_group_count = sum(group_count))
print(total_unique_exons)
## # A tibble: 1 × 1
## sum_group_count
## <int>
## 1 1460986
There are 140986 total unique exons in the entire dataset.
To calculate the average and median length of an exon, we utilize the ‘mean’ and ‘median’ functions applied to the ‘width’ column, which represents the exon width.
# Calculate the mean (average) length
average_length <- mean(df$width)
# Calculate the median length
average_median <- median(df$width)
# Print the results
average_length
## [1] 262.9931
average_median
## [1] 130
Inorder to calculate the length of introns we need to handle the positive strands and negative strands differently because
Length of intron = Start-co-ordinate of exon - end-co-ordinate of previous exon
For positive strands,the end coordinate of previous exon would be the end cordinate of the above row
For negative strands,the end coordinate of previos exon would be end coordinate of the row below
# Load necessary libraries
library(dplyr)
library(tidyr)
# Separate the DataFrame into positive and negative strand DataFrames
positive_strand_df <- df %>%
filter(strand == '+')
negative_strand_df <- df %>%
filter(strand == '-')
# Sort the positive strand DataFrame by 'transcript_id' and 'rank'
positive_strand_df <- positive_strand_df %>%
arrange(transcript_id, rank)
# Calculate intron lengths for positive strand exons
positive_strand_df <- positive_strand_df %>%
group_by(transcript_id) %>%
mutate(intron_length = start - lag(end, default =NA ) - 1)
positive_strand_df$intron_length <- ifelse(is.na(positive_strand_df$intron_length), 0, positive_strand_df$intron_length)
positive_strand_df$intron_length <- as.integer(positive_strand_df$intron_length)
# Sort the negative strand DataFrame by 'transcript_id' and 'rank'
negative_strand_df <- negative_strand_df %>%
arrange(transcript_id, rank)
# Calculate intron lengths for negative strand exons
negative_strand_df <- negative_strand_df %>%
group_by(transcript_id) %>%
mutate(intron_length = start - lead(end, default = NA)-1)
negative_strand_df$intron_length <- ifelse(is.na(negative_strand_df$intron_length), 0, negative_strand_df$intron_length)
negative_strand_df$intron_length <- as.integer(negative_strand_df$intron_length)
# Combine positive and negative strand DataFrames row-wise
combined_df <- bind_rows(positive_strand_df, negative_strand_df)
# Fill NA values in the 'intron_length' column with 0 and convert to integer
combined_df$intron_length <- ifelse(is.na(combined_df$intron_length), 0, combined_df$intron_length)
combined_df$intron_length <- as.integer(combined_df$intron_length)
# Sort the combined DataFrame by 'transcript_id' and 'rank'
combined_df <- combined_df %>%
arrange(transcript_id, rank)
# Print the resulting DataFrame
print(combined_df)
## # A tibble: 1,460,986 × 11
## # Groups: transcript_id [234,485]
## transcript_id transc…¹ seque…² start end width strand exon_id exon_…³ rank
## <int> <chr> <fct> <int> <int> <int> <fct> <int> <chr> <int>
## 1 1 ENST000… chr1 11869 12227 359 + 1 ENSE00… 1
## 2 1 ENST000… chr1 12613 12721 109 + 5 ENSE00… 2
## 3 1 ENST000… chr1 13221 14409 1189 + 8 ENSE00… 3
## 4 2 ENST000… chr1 12010 12057 48 + 2 ENSE00… 1
## 5 2 ENST000… chr1 12179 12227 49 + 3 ENSE00… 2
## 6 2 ENST000… chr1 12613 12697 85 + 4 ENSE00… 3
## 7 2 ENST000… chr1 12975 13052 78 + 6 ENSE00… 4
## 8 2 ENST000… chr1 13221 13374 154 + 7 ENSE00… 5
## 9 2 ENST000… chr1 13453 13670 218 + 9 ENSE00… 6
## 10 3 ENST000… chr1 29554 30039 486 + 10 ENSE00… 1
## # … with 1,460,976 more rows, 1 more variable: intron_length <int>, and
## # abbreviated variable names ¹​transcript_name, ²​sequence_names, ³​exon_name
Inorder to calculate l1, l2, u1, u2 for each exon(except the first and last exon) we need both the intron length on left side (that is the current intron length) and the intron length on right side (that is the intron length of next exon)
So first lets create a column which also stores the intron length of the next exon
As for calculating intron length we need to divide the dataset into 2 parts (one for positive strands and one for negative strands)
The same way we will divide it and calculate the next intron length
# Load necessary libraries
library(dplyr)
# For negative strand DataFrame:
# Calculate the previous intron length within each transcript group
negative_strand_df <- negative_strand_df %>%
group_by(transcript_id) %>%
mutate(next_intron_length = lag(intron_length, default = 0)) %>%
ungroup() %>%
mutate(next_intron_length = as.integer(next_intron_length))
# For positive strand DataFrame:
# Calculate the previous intron length within each transcript group
positive_strand_df <- positive_strand_df %>%
group_by(transcript_id) %>%
mutate(next_intron_length = lead(intron_length, default = 0)) %>%
ungroup() %>%
mutate(next_intron_length = as.integer(next_intron_length))
# Combine positive and negative strand DataFrames row-wise
complete_df <- bind_rows(positive_strand_df, negative_strand_df)
# Fill NA values in the 'intron_length' column with 0 and convert to integer
complete_df$intron_length <- ifelse(is.na(complete_df$intron_length), 0, complete_df$intron_length)
complete_df$intron_length <- as.integer(complete_df$intron_length)
# Sort the combined DataFrame by 'transcript_id' and 'rank'
complete_df <- complete_df %>%
arrange(transcript_id, rank)
# Print the resulting DataFrame
print(complete_df)
## # A tibble: 1,460,986 × 12
## transcript_id transc…¹ seque…² start end width strand exon_id exon_…³ rank
## <int> <chr> <fct> <int> <int> <int> <fct> <int> <chr> <int>
## 1 1 ENST000… chr1 11869 12227 359 + 1 ENSE00… 1
## 2 1 ENST000… chr1 12613 12721 109 + 5 ENSE00… 2
## 3 1 ENST000… chr1 13221 14409 1189 + 8 ENSE00… 3
## 4 2 ENST000… chr1 12010 12057 48 + 2 ENSE00… 1
## 5 2 ENST000… chr1 12179 12227 49 + 3 ENSE00… 2
## 6 2 ENST000… chr1 12613 12697 85 + 4 ENSE00… 3
## 7 2 ENST000… chr1 12975 13052 78 + 6 ENSE00… 4
## 8 2 ENST000… chr1 13221 13374 154 + 7 ENSE00… 5
## 9 2 ENST000… chr1 13453 13670 218 + 9 ENSE00… 6
## 10 3 ENST000… chr1 29554 30039 486 + 10 ENSE00… 1
## # … with 1,460,976 more rows, 2 more variables: intron_length <int>,
## # next_intron_length <int>, and abbreviated variable names ¹​transcript_name,
## # ²​sequence_names, ³​exon_name
#copying the df
copy_of_complete_dataframe <- complete_df
If the length, next length and width are greater than 200 then calculating them is much easier. We simply use the below formulas
Lets consider each one and understand what will happen if these (length, next length or width are greater than 200)
L1
According to the question L1 is 100 units of length before sj (start-cordinate) Hence L1 = start - 100
If we take the first exon L1 would be zero Since transcripts start only with an exon there is no space for l1
So l1 basicaly depends on the intron length If intron length is 0 then L1=0 I have add the code for this statement on
line 14
But if intron length is less than 200 then l1 will become start minus half of intron length L1=start-intron_length/2L2
According to the question L2 is 100 units of length after sj (start-cordinate) Hence L2 = start + 100
So l2 basicaly depends on the exon width
If width less than 200 then l2 will become start plus half of width L2=start+exon_width/2u1
According to the question u1 is 100 units of length before sj(end-cordinate) Hence u1 = end - 100
So u1 also basicaly depends on the exon width
If width less than 200 then u1 will become end minus half of exon width L2=end - exon_width/2u2 According to the question u2 is 100 units of length after sj(end-cordinate) Hence u2 = end +100 If we take the last exon u2 would be zero Since transcripts end only with an exon there is no space for u2 So u2 basicaly depends on the next intron length If next intron length is 0 then u2=0 I have add the code for the same But if next intron length is less than 200 then u2 will become end minus half of next intron length L2=start-intron_length/2
Incase if you want the dataframe to not contain values where L2=u1 then please uncomment the lines. It will result in the dataframe where L1 cannot be equal to u1
# Define a function to calculate positions
calculate_positions <- function(exon) {
# Extract data from the exon
start <- exon$start
end <- exon$end
length <- exon$intron_length
width <- exon$width
next_intron_length <- exon$next_intron_length
# Define inner functions to calculate L1, L2, U1, and U2
calculate_l1 <- function(start, length) {
if (length <= 200) {
if (length == 0) {
l1 <- 0
} else {
if (length %% 2 == 0) {
length <- length - 1
}
# else { length <- length - 2}
l1 <- start - (length / 2)
}
} else {
l1 <- start - 100
}
return(l1)
}
calculate_l2 <- function(start, width) {
if (width <= 200) {
if (width %% 2 == 0) {
width <- width - 1
}
#else {width <- width - 2}
l2 <- start + (width / 2)
} else {
l2 <- start + 100
}
return(l2)
}
calculate_u1 <- function(end, width) {
if (width <= 200) {
if (width %% 2 == 0) {
width <- width - 1
}
#else {width <- width - 2}
u1 <- end - (width / 2)
} else {
u1 <- end - 100
}
return(u1)
}
calculate_u2 <- function(end, next_intron_length) {
if (next_intron_length != 0) {
if (next_intron_length <= 200) {
if (next_intron_length %% 2 == 0) {
next_intron_length <- next_intron_length - 1
}
#else {next_intron_length <- next_intron_length - 2}
u2 <- end + (next_intron_length / 2)
} else {
u2 <- end + 100
}
} else {
u2 <- 0
}
return(u2)
}
# Calculate the positions
l1 <- calculate_l1(start, length)
l2 <- calculate_l2(start, width)
u1 <- calculate_u1(end, width)
u2 <- calculate_u2(end, next_intron_length)
# Round the positions using floor and ceiling
l1 <- ceiling(l1)
l2 <- floor(l2)
u1 <- ceiling(u1)
u2 <- floor(u2)
# Return the calculated positions
return(data.frame(L1 = l1, L2 = l2, U1 = u1, U2 = u2))
}
# Apply the calculate_positions function to the DataFrame and create new columns
complete_df <- complete_df %>%
rowwise() %>%
do(calculate_positions(.)) %>%
bind_cols(complete_df)
# Sort the combined DataFrame by 'transcript_id' and 'rank'
complete_df <- complete_df %>%
arrange(transcript_id, rank)
complete_df
##Optimized code for Bonus problem
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
# function which calculates the values of L1 L2 U1 U2
calculate_positions_vectorized <- function(df) {
df <- df %>%
mutate(
half_intron_length = ifelse(intron_length == 0, 0, ifelse(intron_length %% 2 == 0, intron_length - 1, intron_length) / 2),
half_exon_length = ifelse(width %% 2 == 0, width - 1, width) / 2,
next_half_intron_length = ifelse(next_intron_length == 0, 0, ifelse(next_intron_length %% 2 == 0, next_intron_length - 1, next_intron_length) / 2),
L1 = ifelse(intron_length == 0, 0, ceiling(start - pmin(100, half_intron_length))),
L2 = floor(start + pmin(100, half_exon_length)),
U1 = ceiling(end - pmin(100, half_exon_length)),
U2 = ifelse(next_intron_length == 0, 0, floor(end + pmin(100, next_half_intron_length)))
)
return(df)
}
# Apply the calculate_positions_vectorized function to the data frame 'df'
final_df <- calculate_positions_vectorized(copy_of_complete_dataframe)
# Sort the combined data frame by 'transcript_id' and 'rank'
final_df <- final_df %>%
arrange(transcript_id, rank)
# Create a list of column names to exclude
columns_to_exclude <- c("next_intron_length", "half_exon_length","half_intron_length","next_half_intron_length"
)
# Select only the columns that are NOT in the list of columns to exclude
filtered_df <- final_df[, !names(final_df) %in% columns_to_exclude]
filtered_df